# Simulate M heights from a two-component Gaussian mixture:
# "Female" (prob .4) ~ N(162, var 5), "Male" (prob .6) ~ N(176, var 8).
# Vectorized: draw all labels at once, then fill each group in one call
# (the original looped one observation at a time with `=` assignment).
M <- 100
lab <- sample(c("Female", "Male"), size = M, replace = TRUE, prob = c(.4, .6))
out <- numeric(M)
is_f <- lab == "Female"
out[is_f]  <- rnorm(sum(is_f),  mean = 162, sd = sqrt(5))
out[!is_f] <- rnorm(sum(!is_f), mean = 176, sd = sqrt(8))
hist(out, main = "", xlab = "")
?rgb
# Palette used by the density plots below: four hues (blue, red, orange,
# green) first at alpha 0.7, then the same four at alpha 0.3.
colos <- local({
  hues <- rbind(c(32, 74, 135),
                c(204, 0, 0),
                c(200, 141, 0),
                c(78, 153, 6)) / 255
  as.vector(vapply(c(0.7, 0.3),
                   function(a) apply(hues, 1, function(h) rgb(h[1], h[2], h[3], a)),
                   character(4)))
})
# mixtools provides rnormmix() (mixture sampling) and normalmixEM() (EM fit).
# TRUE spelled out instead of the reassignable shorthand T.
suppressMessages(require(mixtools, quietly = TRUE))

?rnormmix
# Draw n = 250 points from "Bart's density" (a claw density): one broad N(0,1)
# component (weight .5) plus five narrow N(mu, 0.1) spikes at -1, -.5, 0, .5, 1
# (weight .1 each).  prob = TRUE puts the histogram on the density scale;
# the redundant paste(x, sep = "") wrappers are replaced by the plain string
# and paste0().
n <- 250
XX <- rnormmix(n,
               lambda = c(0.5, rep(0.1, 5)),
               mu     = c(0, ((0:4)/2) - 1),
               sigma  = c(1, rep(0.1, 5)))
hist(XX, prob = TRUE, col = gray(.8), border = NA, xlab = "x",
     main = "Data from Bart's density",
     sub  = paste0("n = ", n),
     breaks = 50)
rug(XX, col = rgb(0, 0, 0, .5))
# True claw ("Bart") density matching the rnormmix() draw above:
# 0.5 * N(0,1) plus five N(mu, 0.1) spikes (weight 0.1) at -1, -.5, 0, .5, 1.
# Vectorized in x.  (Top-level assignment fixed from `=` to `<-`.)
true.den <- function(x) {
  0.5 * dnorm(x, 0, 1) +
    0.1 * dnorm(x, -1.0, 0.1) + 0.1 * dnorm(x, -0.5, 0.1) +
    0.1 * dnorm(x,  0.0, 0.1) + 0.1 * dnorm(x,  0.5, 0.1) +
    0.1 * dnorm(x,  1.0, 0.1)
}
# Overlay on the histogram drawn above -- draw order matters: the true density
# first, then three kernel estimates with decreasing bandwidth.
curve(true.den, col = rgb(1,0,0,0.4), lwd = 3, n = 500, add = TRUE)
# Default bandwidth: over-smoothed, misses the spikes ("Over" in the legend).
lines(density(XX), col = colos[3], lwd = 3)
# bw = .08: roughly right scale for the 0.1-sd spikes ("Just right").
lines(density(XX, bw = .08), col = colos[4], lwd = 3)
# bw = .008: under-smoothed, chases individual points ("Under").
lines(density(XX, bw = .008), col = colos[5], lwd = 3)
legend("topright", c("True","Over", "Just right", "Under"), lwd = 5,
col = c(rgb(1,0,0,0.4), colos[3], colos[4],colos[5]), cex = 0.8, bty = "n")

#' Fit a two-component univariate Gaussian mixture by a hand-rolled EM.
#'
#' @param y numeric data vector.
#' @param p length-2 starting mixing weights (should sum to 1).
#' @param mu length-2 starting component means.
#' @param sigma length-2 starting component standard deviations.
#' @param n_iter number of EM iterations to run (no convergence check).
#' @param plot_flag if TRUE, redraw the current fit after every iteration;
#'   each frame pauses 1.5 s and calls set.seed(123), which resets the
#'   global RNG stream as a side effect.
#' @return list with `parameters` (named vector p1, p2, mu1, mu2, sigma1,
#'   sigma2), `deviance` (-2 * log-likelihood at the final iterate) and
#'   `res` (data frame with one row per iteration, including the start).
#' NOTE: the M-step sqrt() yields NaN when a component collapses (its
#' weighted variance becomes numerically <= 0) -- see the degenerate-start
#' demo further below.
handmade.em <- function(y, p, mu, sigma, n_iter, plot_flag = TRUE)
{
  cols <- c(rgb(1, 0, 0, .3), rgb(0, 1, 0, .3))
  # Deviance (-2 log-likelihood) at the starting values.
  like     <- p[1] * dnorm(y, mu[1], sigma[1]) + p[2] * dnorm(y, mu[2], sigma[2])
  deviance <- -2 * sum(log(like))
  # Pre-allocated trace: iter, p1, p2, mu1, mu2, sigma1, sigma2, deviance.
  res      <- matrix(NA, n_iter + 1, 8)
  res[1, ] <- c(0, p, mu, sigma, deviance)
  for (iter in seq_len(n_iter)) {
    # E-step: posterior responsibilities of each component for every point.
    d1 <- p[1] * dnorm(y, mu[1], sigma[1])
    d2 <- p[2] * dnorm(y, mu[2], sigma[2])
    r1 <- d1 / (d1 + d2)
    r2 <- 1 - r1
    # M-step: responsibility-weighted moment updates (sd via E[y^2] - mean^2,
    # which is where the NaN can arise on collapse).
    p[1]     <- mean(r1)
    mu[1]    <- sum(r1 * y) / sum(r1)
    sigma[1] <- sqrt(sum(r1 * (y^2)) / sum(r1) - (mu[1])^2)
    p[2]     <- 1 - p[1]
    mu[2]    <- sum(r2 * y) / sum(r2)
    sigma[2] <- sqrt(sum(r2 * (y^2)) / sum(r2) - (mu[2])^2)
    like     <- p[1] * dnorm(y, mu[1], sigma[1]) + p[2] * dnorm(y, mu[2], sigma[2])
    deviance <- -2 * sum(log(like))
    res[iter + 1, ] <- c(iter, p, mu, sigma, deviance)
    if (plot_flag) {
      hist(y, prob = TRUE, breaks = 30, col = gray(.8), border = NA,
           main = "", xlab = paste0("EM Iteration: ", iter, "/", n_iter))
      set.seed(123)  # keep the jitter below identical across frames
      points(jitter(y), rep(0, length(y)),
             pch = 19, cex = .6,
             # colour each point by its currently more likely component
             col = cols[(dnorm(y, mu[1], sigma[1]) > dnorm(y, mu[2], sigma[2])) + 1])
      curve(p[1] * dnorm(x, mu[1], sigma[1]) + p[2] * dnorm(x, mu[2], sigma[2]),
            lwd = 4, col = rgb(0, 0, 0, .5), add = TRUE)
      Sys.sleep(1.5)
    }
  }
  res <- data.frame(res)
  names(res) <- c("iteration", "p1", "p2", "mu1", "mu2", "sigma1", "sigma2", "deviance")
  list(parameters = c(p = p, mu = mu, sigma = sigma),
       deviance = deviance,
       res = res)
}
# Fit the hand-made EM to the Old Faithful waiting times, starting from two
# deliberately poor, overlapping components; with the default plot_flag the
# call animates 20 frames (1.5 s apart).
data("faithful")
?faithful
hem_fit <- handmade.em(
  faithful$waiting,
  p      = c(.5, .5),
  mu     = c(45, 55),
  sigma  = c(8, 8),
  n_iter = 20
)




















# Final EM estimates (compare with mixtools::normalmixEM below, which lands
# on essentially the same fit: p ~ (.36, .64), mu ~ (54.6, 80.1)).
round( hem_fit$parameters, 3 )
## p1 p2 mu1 mu2 sigma1 sigma2
## 0.360 0.640 54.596 80.079 5.855 5.880
# Deviance = -2 * log-likelihood at the final iterate.
hem_fit$deviance
## [1] 2068.005
# ggplot2 + gridExtra are used by plotConvMC() below to panel the traces.
require(ggplot2)
## Loading required package: ggplot2
require(gridExtra)
## Loading required package: gridExtra
# Panel the EM parameter trajectories of several random restarts.
#
# df: stacked `res` data frames from handmade.em(); first column = iteration,
#     last column = restart id (`rep`), parameter columns in between.
# title: optional string passed to grid.arrange(top = ...).
# Returns the gtable drawn by grid.arrange (invisibly, as grid.arrange does).
#
# Changes vs. original: removed the unused local G; replaced the deprecated
# aes_string() with the `.data` pronoun; lapply() (which forces its argument)
# replaces the index loop, so each panel safely captures its own column name.
plotConvMC <- function(df, title = NULL)
{
  df$rep <- as.factor(df$rep)
  x_nm   <- names(df)[1]
  rep_nm <- names(df)[ncol(df)]
  par_nms <- names(df)[2:(ncol(df) - 1)]
  graf <- lapply(par_nms, function(nm) {
    ggplot(df) +
      geom_line(aes(x = .data[[x_nm]], y = .data[[nm]],
                    color = .data[[rep_nm]])) +
      xlab("iteration") + ylab(nm) +
      theme(legend.position = "none")
  })
  do.call("grid.arrange", c(graf, ncol = 3, top = title))
}
# Multistart study: run the EM from 10 random initialisations and stack the
# per-iteration traces (column `rep` identifies the run).  Results are
# collected in a pre-allocated list and bound once, instead of growing D.em
# with rbind() inside the loop; the RNG call order (runif, then rnorm/rlnorm
# forced inside handmade.em) is unchanged, so the draws are identical.
set.seed(1234)
runs <- vector("list", 10)
for (m in seq_len(10)) {
  p1 <- runif(1, 0.1, 0.9)
  df.em <- handmade.em(faithful$waiting,
                       p = c(p1, 1 - p1),
                       mu = rnorm(2, 70, 15), sigma = rlnorm(2, 2, 0.7),
                       n_iter = 50, plot_flag = FALSE)$res
  df.em$rep <- m
  runs[[m]] <- df.em
}
D.em <- do.call(rbind, runs)
plotConvMC(D.em)

# Degenerate start: component 1 (mu = 30, sigma = 2) sits far below all the
# waiting times, so its responsibilities collapse towards a single point.
# The M-step's weighted variance then hits numerical zero, sqrt() of a tiny
# negative rounding error returns NaN, the deviance passes through -Inf, and
# every parameter is NaN from iteration 5 on -- as the transcript shows.
handmade.em(faithful$waiting, p = c(0.5, 0.5), mu = c(30, 40), sigma = c(2, 10), n_iter = 5)




## $parameters
## p1 p2 mu1 mu2 sigma1 sigma2
## NaN NaN NaN NaN NaN NaN
##
## $deviance
## [1] NaN
##
## $res
## iteration p1 p2 mu1 mu2 sigma1 sigma2
## 1 0 5.000000e-01 0.5000000 30.00000 40.00000 2.000000e+00 10.00000
## 2 1 1.290643e-11 1.0000000 43.00624 70.89706 1.130252e-01 13.56996
## 3 2 4.706595e-11 1.0000000 43.00000 70.89706 8.259062e-07 13.56996
## 4 3 2.337458e-05 0.9999766 43.00000 70.89771 8.259062e-07 13.56945
## 5 4 3.675314e-03 0.9963247 43.00000 70.99997 0.000000e+00 13.48858
## 6 5 NaN NaN NaN NaN NaN NaN
## deviance
## 1 5227.041
## 2 2190.578
## 3 2190.565
## 4 2174.461
## 5 -Inf
## 6 NaN
?normalmixEM
# Reference fit: mixtools' EM for a univariate Gaussian mixture (2 components
# by default).  The estimates agree with handmade.em to ~2 decimals.
mt_fit = normalmixEM(faithful$waiting)
## number of iterations= 24
summary(mt_fit)
## summary of normalmixEM object:
## comp 1 comp 2
## lambda 0.360887 0.639113
## mu 54.614891 80.091091
## sigma 5.871244 5.867717
## loglik at estimate: -1034.002
names(mt_fit)
## [1] "x" "lambda" "mu" "sigma" "loglik"
## [6] "posterior" "all.loglik" "restarts" "ft"
plot(mt_fit)

class(mt_fit)
## [1] "mixEM"
?plot.mixEM
# density = TRUE adds the fitted mixture density; w widens the component sds
# drawn on top of the histogram.
plot(mt_fit, density = TRUE, w = 1.1)

# Hard classification from the posterior: 1 if comp-1 probability >= .5,
# else 2 (MAP assignment of each observation).
(mt_fit$posterior[,1] < 0.5) + 1
## [1] 2 1 2 1 2 1 2 2 1 2 1 2 2 1 2 1 1 2 1 2 1 1 2 2 2 2 1 2 2 2 2 2 1 2 2 1 1
## [38] 2 1 2 2 1 2 1 2 2 1 1 2 1 2 2 1 2 1 2 2 1 2 2 1 2 1 2 1 2 2 2 1 2 2 1 2 2
## [75] 1 2 1 2 2 2 2 2 2 1 2 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 2 1 2 1 2 2 2
## [112] 1 2 2 1 2 1 2 1 2 1 2 2 1 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 2 2 2 1 2 1
## [149] 2 1 2 2 1 2 2 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 2 2 2 2 1 2 2 1 2 2 2 1
## [186] 2 2 1 2 1 2 1 2 2 2 2 2 2 1 2 1 2 2 1 2 1 2 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2
## [223] 1 2 2 2 2 2 2 2 2 1 2 1 2 1 1 2 2 1 2 1 2 1 2 2 1 2 2 2 1 2 2 2 2 2 2 2 1
## [260] 2 2 2 1 2 1 1 2 2 1 2 1 2
?kmeans
# Hard-clustering baseline: k-means on the same data, seeded near the two
# modes of the waiting-time distribution.
km_fit <- kmeans( faithful$waiting, centers = c(45, 55) )
names(km_fit)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Translate the k-means output into mixture-style (p, mu, sigma) summaries
# for a side-by-side comparison with the EM fit below.
list(p = km_fit$size/sum(km_fit$size),
mu = as.vector(km_fit$centers),
sigma = sqrt(km_fit$withinss/km_fit$size)
)
## $p
## [1] 0.3676471 0.6323529
##
## $mu
## [1] 54.75000 80.28488
##
## $sigma
## [1] 5.865791 5.610953
# The soft EM fit gives nearly the same component summaries.
list(p = mt_fit$lambda, mu = mt_fit$mu, sigma = mt_fit$sigma)
## $p
## [1] 0.3608869 0.6391131
##
## $mu
## [1] 54.61489 80.09109
##
## $sigma
## [1] 5.871244 5.867717
suppressMessages(require(plotly, quietly = T))
?MASS::geyser
# Bivariate view of the geyser data: 2-D kernel density estimate on a
# 50 x 50 grid, rendered as an interactive plotly surface.
dd <- MASS::geyser
kd <- with(MASS::geyser, MASS::kde2d(duration, waiting, n = 50))
p <- plot_ly(x = kd$x, y = kd$y, z = kd$z) %>% add_surface()
p
suppressMessages(require(mclust, quietly = T))
?mclust
# Model-based clustering of the bivariate geyser data (duration, waiting).
# Mclust selects both the covariance structure and the number of components
# by BIC; here it picks VVI (diagonal, varying volume/shape) with G = 4.
mod1 = Mclust(dd)
mod1
## 'Mclust' model object: (VVI,4)
##
## Available components:
## [1] "call" "data" "modelName" "n"
## [5] "d" "G" "BIC" "loglik"
## [9] "df" "bic" "icl" "hypvol"
## [13] "parameters" "z" "classification" "uncertainty"
summary(mod1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVI (diagonal, varying volume and shape) model with 4 components:
##
## log-likelihood n df BIC ICL
## -1330.13 299 19 -2768.568 -2798.746
##
## Clustering table:
## 1 2 3 4
## 90 17 98 94
# MAP component labels for each of the 299 observations.
mod1$classification
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 4 3 1 1 4 3 1 4 3 4 3 4 3 1 4 3 4 3 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 4 3 4 3 4 3 2 1 1 1 1 1 4 3 2 3 4 3 4 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 4 3 4 3 4 3 4 3 4 3 2 3 4 3 4 3 1 2 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 4 3 4 3 4 3 4 3 1 4 3 4 3 1 1 4 3 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 4 3 2 1 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 3 4 3 4 3 4 3 4 3 2 3 1 4 3 4 3 4 3 4 3
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 4 3 1 1 4 3 1 1 1 1 1 1 2 3 1 1 1 1 4 3
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 1 1 1 1 1 1 4 3 2 3 4 3 4 3 4 3 4 1 1 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 4 3 4 3 4 3 2 3 2 1 4 3 2 3 4 3 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 4 3 4 3 1 1 1 4 3 4 3 4 3 4 3 2 1 4 3 4
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
## 3 4 3 1 4 1 1 2 1 1 1 4 3 4 3 4 3 2 1 1
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
## 4 3 1 1 1 1 1 1 4 3 4 3 4 3 1 1 1 4 3 1
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
## 4 3 2 1 4 3 2 4 3 4 3 1 1 4 3 4 3 1 1 1
## 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
## 1 4 3 1 1 4 3 4 3 2 3 1 4 3 4 3 1 1 1 1
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299
## 1 1 1 4 1 4 3 4 3 4 3 4 3 4 3 4 3 1 4
class(mod1)
## [1] "Mclust"
?plot.Mclust
# Four diagnostic views on one page: uncertainty, classification, fitted
# density contours, and the BIC trace over models/components.
par(mfrow = c(2,2))
plot(mod1, what = "uncertainty")
plot(mod1, what = "classification")
plot(mod1, what = "density")
plot(mod1, what = "BIC")

# Noisy circle: n points placed uniformly on a circle of radius 2, then
# perturbed with isotropic Gaussian noise (sd = 0.15).  A ring is a worst
# case for Gaussian mixtures, as the next Mclust fit illustrates.
par(mfrow = c(1,1))
set.seed(123)
n <- 500
radius <- 2
angle <- 2 * pi * runif(n)
x <- radius * cos(angle)
y <- radius * sin(angle)
SD <- .15
xn <- x + rnorm(n, sd = SD)
yn <- y + rnorm(n, sd = SD)
XX <- cbind(xn, yn)
# Noise-free circle in black, observed (noisy) points in red.
plot(x, y, pch = 19, col = "black", asp = 1, cex = .5)
points(xn, yn, pch = 19, col = "red", cex = .5)

# GMM on the ring: no small set of ellipses matches an annulus, so BIC ends
# up tiling the circle with 8 equal-volume/shape (EEV) components.
mod2 = Mclust(cbind(xn,yn))
mod2
## 'Mclust' model object: (EEV,8)
##
## Available components:
## [1] "call" "data" "modelName" "n"
## [5] "d" "G" "BIC" "loglik"
## [9] "df" "bic" "icl" "hypvol"
## [13] "parameters" "z" "classification" "uncertainty"
summary(mod2)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 8 components:
##
## log-likelihood n df BIC ICL
## -1013.33 500 33 -2231.742 -2373.283
##
## Clustering table:
## 1 2 3 4 5 6 7 8
## 76 45 76 58 45 69 66 65
# Same four diagnostic panels as for the geyser fit.
par(mfrow = c(2,2))
plot(mod2, what = "uncertainty")
plot(mod2, what = "classification")
plot(mod2, what = "density")
plot(mod2, what = "BIC")

# Reset the plotting grid; pracma supplies distmat() used by kmpp() below.
par(mfrow = c(1,1))
suppressWarnings(require(pracma, quietly = T))
?distmat
?kmeans
# k-means++ seeding (Arthur & Vassilvitskii, 2007): pick the first centre
# uniformly at random, then pick each further centre with probability
# proportional to its distance from the nearest centre chosen so far, and
# finally run stats::kmeans() from those starting centres.
#
# X: numeric matrix, rows = observations.
# k: number of clusters (>= 1).
# Returns the fitted "kmeans" object.
#
# Fix vs. original: `for (i in 2:k)` ran i = 2, 1 when k == 1 (2:1 counts
# down), corrupting C; the loop is now guarded so k = 1 works.
kmpp <- function(X, k) {
  n <- nrow(X)
  C <- numeric(k)
  C[1] <- sample(seq_len(n), 1)
  if (k >= 2) {
    for (i in 2:k) {
      # distmat() is pracma::distmat; zero entries of C index nothing in
      # X[C, ], so only the centres chosen so far enter the distance matrix.
      dm <- distmat(X, X[C, ])
      pr <- apply(dm, 1, min)
      pr[C] <- 0  # never re-pick an already chosen centre
      C[i] <- sample(seq_len(n), 1, prob = pr)
    }
  }
  kmeans(X, X[C, ])
}
suppressMessages(require(factoextra, quietly = T))
library(help = factoextra)
?fviz_cluster
# k-means++ with 3 clusters on the geyser data, drawn with normal-contour
# ellipses per cluster.
ddmat <- as.matrix(dd)
km3 <- kmpp(ddmat, k = 3)
fviz_cluster(km3, data = ddmat,
geom = "point",
ellipse.type = "norm",
palette = "Dark2",
ggtheme = theme_minimal())

# Same, with 4 clusters for comparison.
km4 <- kmpp(ddmat, k = 4)
fviz_cluster(km4, data = ddmat,
geom = "point",
ellipse.type = "norm",
palette = "Dark2",
ggtheme = theme_minimal())

?fviz_nbclust
# Three standard ways to choose k: elbow (WSS), average silhouette width,
# and the gap statistic.
fviz_nbclust(ddmat, kmeans, method = "wss")

fviz_nbclust(ddmat, kmeans, method = "silhouette")

fviz_nbclust(ddmat, kmeans, method = "gap_stat")
